
!pip install folium
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime
import datetime
import statsmodels.formula.api as sm
# Load the traffic-stop sample from the Excel workbook.
original = pd.read_excel('traffic_sample.xlsx')

# Make sure both coordinate columns are stored as floats.
for coord in ("Latitude", "Longitude"):
    original[coord] = original[coord].astype(float)

# Drop rows whose gender is recorded as "U".
original = original[original["Gender"] != "U"]

# Keep rows whose Year lies strictly between 1900 and 2020
# (which also excludes the placeholder value 0).
year = original["Year"]
original = original[(year > 1900) & (year < 2020)]
original.head(n=25)

# Columns carried into the working frame used by the maps and charts.
filtered_cols = [
    "Date Of Stop",
    "Time Of Stop",
    "SubAgency",
    "Description",
    "Location",
    "Latitude",
    "Longitude",
    "Violation Type",
    "Race",
    "Gender",
]

# Independent copy so later column additions do not touch `original`.
sam = original.loc[:, filtered_cols].copy()
def generate_map(loc=(39.1247, -77.1905), zoom=10.5, tile="openstreetmap"):
    """Return a folium map with a scale bar and three selectable base tile sets.

    Parameters
    ----------
    loc : sequence of two floats
        Latitude/longitude of the map centre. The default is the location the
        original notebook used for Montgomery County, Maryland.
    zoom : float
        Initial zoom level.
    tile : str
        Tile set passed to ``folium.Map`` as the initial base layer.

    Returns
    -------
    folium.Map
        Map holding the initial tile set plus whichever of 'openstreetmap',
        'Stamen Watercolor' and 'Stamen Toner' is not already the initial one.
    """
    # The default is an immutable tuple (no shared mutable default); folium
    # receives a fresh list on every call.
    res_map = folium.Map(location=list(loc), zoom_start=zoom,
                         control_scale=True, tiles=tile)

    # folium.Map has already added `tile`, so adding the same tile set again
    # would put a duplicate layer on the map. Compare names ignoring case and
    # whitespace.
    initial = "".join(tile.lower().split()) if isinstance(tile, str) else None

    # NOTE(review): recent folium releases may require an explicit URL and
    # attribution for the Stamen tile sets -- confirm against the installed version.
    for extra in ('openstreetmap', 'Stamen Watercolor', 'Stamen Toner'):
        if "".join(extra.lower().split()) != initial:
            folium.TileLayer(extra).add_to(res_map)
    return res_map
# Hex colour used for each race label. Built once at import time instead of
# on every call, since color_select runs once per plotted row.
_RACE_COLORS = {
    'ASIAN': "#ed8134",            # Orange
    'BLACK': "#391cba",            # Indigo
    'HISPANIC': "#119992",         # Teal
    'NATIVE AMERICAN': "#9412b8",  # Violet
    'OTHER': "#127bb8",            # Blue
    'WHITE': "#e81c1c",            # Red
}


def color_select(race):
    """Return the hex colour string assigned to ``race``.

    Parameters
    ----------
    race : str
        One of 'ASIAN', 'BLACK', 'HISPANIC', 'NATIVE AMERICAN', 'OTHER', 'WHITE'.

    Raises
    ------
    KeyError
        If ``race`` is not one of the labels above.
    """
    return _RACE_COLORS[race]
# Start from a blank base map.
map_total = generate_map()

# One toggleable overlay per race, created in the same order as the lookup below.
asian_fg, black_fg, his_fg, na_fg, other_fg, white_fg = (
    folium.FeatureGroup(name=label)
    for label in ("Asian", "Black", "Hispanic", "Native American", "Other", "White")
)

# Lookup from the race label stored in the data to the overlay that
# receives that race's markers.
race = dict(zip(
    ('ASIAN', 'BLACK', 'HISPANIC', 'NATIVE AMERICAN', 'OTHER', 'WHITE'),
    (asian_fg, black_fg, his_fg, na_fg, other_fg, white_fg),
))
# Legend overlay for the marker map.
# The HTML below defines three small CSS shapes (circle, square, triangle) and a
# fixed-position white box that lists one coloured circle per race and the
# marker shape used for each gender (triangle = male, square = female).
# The per-race colours are set inline on each circle and match the hex values
# returned by color_select.
# NOTE(review): the bare `div { display: inline-block; }` rule is not scoped to
# the legend, so it applies to every <div> on the rendered page -- confirm that
# this is intended.
legend_html = '''
<style>
.circle {
height: 10px;
width: 10px;
background-color: orange;
border-radius: 50%;
}
.square {
height: 10px;
width: 10px;
background-color: #ed8134;
}
div {
display: inline-block;
}
legend {
font-size: 13px
}
.triangle {
width: 0;
height: 0;
border-left: 7.5px solid transparent;
border-right: 7.5px solid transparent;
border-bottom: 15px solid #ed8134;
}
</style>
<div style="position: fixed;
left: 50px; width: 150px;
border:2px solid black; z-index:9999; font-size:12px; background-color: white;">
<legend><b>Legend:</b></legend>
<b>Race: </b><br>
Asian: <div class = circle style = "background-color: #ed8134"> </div> <br>
White: <div class = circle style = "background-color: #e81c1c"> </div><br>
Black: <div class = circle style = "background-color: #391cba"> </div><br>
Hispanic: <div class = circle style = "background-color: #119992"> </div><br>
Native American: <div class = circle style = "background-color: #9412b8"> </div><br>
Other: <div class = circle style = "background-color: #127bb8"> </div>
<hr>
<b>Gender: </b><br>
Male: <div class = triangle> </div> <br>
Female: <div class = square> </div> <br>
</div>
'''
# Attach the raw HTML to the root of the map's document so it is rendered
# alongside the map.
map_total.get_root().html.add_child(folium.Element(legend_html))
# Draw one marker per stop on the overlay of the stop's race:
# colour encodes race, shape encodes gender (triangle = "M", square otherwise),
# and the popup shows the violation description.
for _, stop in sam.iterrows():
    marker = folium.RegularPolygonMarker(
        location=[stop["Latitude"], stop["Longitude"]],
        popup=stop["Description"],
        color=color_select(stop["Race"]),
        fill=True,
        weight=1,
        number_of_sides=3 if stop["Gender"] == "M" else 4,
        radius=4,
        # Was misspelled `opactity`, so the intended 0.4 stroke opacity
        # was never applied.
        opacity=.4,
    )
    marker.add_to(race[stop["Race"]])

# Register every race overlay on the map, then add the control that toggles them.
for layer in race.values():
    layer.add_to(map_total)
folium.LayerControl().add_to(map_total)
display(map_total)
# Count stops for every (Gender, Race) pair: tag each row with 1 and sum per group.
gr_df = sam.assign(count=1)
aggregation_functions = {'count': 'sum'}
nd = gr_df.groupby(['Gender', 'Race']).aggregate(aggregation_functions)

# Large canvas (40 x 30 inches) to go with the oversized fonts below.
fig, axs = plt.subplots()
fig.set_size_inches(40, 30)

# Grouped bars: one cluster per gender, one bar per race.
b1 = sns.barplot(
    data=nd.reset_index(),
    x="Gender",
    y="count",
    hue="Race",
    palette="Spectral",
    ax=axs,
)
b1.set_title("The Occurrence of Traffic Violation Based on Gender and Race", fontsize=40)
b1.set_xlabel("Gender", fontsize=30)
b1.set_ylabel("Count", fontsize=30)
b1.tick_params(axis='both', labelsize=25)

# Legend sits below the axes, laid out in three columns.
b1.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.05),
    fancybox=True,
    shadow=True,
    ncol=3,
    labelspacing=2,
    fontsize=20,
)
plt.show()
# Hour component of each stop's time.
sam["hour"] = [stamp.hour for stamp in sam["Time Of Stop"]]

# Two-hour bins, closed on the left: [0, 2) -> 1, [2, 4) -> 2, ..., [22, 24) -> 12.
bin_edges = list(range(0, 25, 2))
bin_labels = list(range(1, 13))
cut = pd.cut(sam["hour"], bins=bin_edges, labels=bin_labels, right=False)
sam["cut"] = cut
# Working copy with a unit weight per stop, so summing gives a stop count.
df_copy = sam.copy()
df_copy['count'] = 1
hr_map = generate_map()

# One toggleable layer per two-hour bin ("Hours 0 to 1", "Hours 2 to 3", ...);
# only the first layer is shown when the map loads.
hm_fg = [
    folium.FeatureGroup(
        name="Hours " + str(2 * slot) + " to " + str(2 * slot + 1),
        show=(slot == 0),
    )
    for slot in range(12)
]

# Fill each layer with a heat map of the stops in its bin (bin labels run 1..12).
for bin_label, layer in enumerate(hm_fg, start=1):
    in_bin = df_copy[df_copy["cut"] == bin_label]
    # Group by location only, so 'count' is summed into the number of stops at
    # each point. Grouping on 'count' as well (as before) left nothing to sum
    # and gave every location a weight of 1.
    weighted_points = (
        in_bin[['Latitude', 'Longitude', 'count']]
        .groupby(['Latitude', 'Longitude'])
        .sum()
        .reset_index()
        .values.tolist()
    )
    HeatMap(data=weighted_points, radius=8, max_zoom=13).add_to(layer)

for fg in hm_fg:
    fg.add_to(hr_map)
folium.LayerControl().add_to(hr_map)
display(hr_map)
Use the layer control in the top-right corner of the map to choose which time window the heatmap shows.
time_map = generate_map()

# One animation frame per two-hour bin present in the data, in bin order.
# Each frame is a list of [latitude, longitude, stop count] rows.
# The loop previously iterated over the bin labels (1..12) but filtered on
# `hour == label`, so each frame held a single clock hour instead of the bin.
df_hour_list = []
for time_bin in df_copy["cut"].dropna().sort_values().unique():
    in_bin = df_copy.loc[df_copy["cut"] == time_bin, ['Latitude', 'Longitude', 'count']]
    df_hour_list.append(
        in_bin.groupby(['Latitude', 'Longitude']).sum().reset_index().values.tolist()
    )

HeatMapWithTime(
    df_hour_list,
    radius=8,
    gradient={0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'},
    min_opacity=0.5,
    max_opacity=0.8,
    use_local_extrema=True,
    auto_play=True,
).add_to(time_map)
folium.LayerControl().add_to(time_map)
display(time_map)
# Heat map of every stop, regardless of time of day.
df_copy = sam.copy()
df_copy['count'] = 1  # unit weight per stop
base_map = generate_map()

# Group by location only, so 'count' is summed into the number of stops at each
# point. Grouping on 'count' as well (as before) left nothing to sum and gave
# every location a weight of 1.
weighted_points = (
    df_copy[['Latitude', 'Longitude', 'count']]
    .groupby(['Latitude', 'Longitude'])
    .sum()
    .reset_index()
    .values.tolist()
)
HeatMap(data=weighted_points, radius=8, max_zoom=13).add_to(base_map)
folium.LayerControl().add_to(base_map)
display(base_map)
# Count stops for every (Race, Violation Type) pair: tag each row with 1 and sum per group.
rv = sam.assign(count=1)
aggregation_functions = {'count': 'sum'}
nd = rv.groupby(['Race', 'Violation Type']).aggregate(aggregation_functions)

# Large canvas (40 x 30 inches) to go with the oversized fonts below.
fig, axs = plt.subplots()
fig.set_size_inches(40, 30)

# Grouped bars: one cluster per race, one bar per violation type.
r1 = sns.barplot(
    data=nd.reset_index(),
    x="Race",
    y="count",
    hue="Violation Type",
    palette=["#ff8378", "#5bc7a7"],
    ax=axs,
)
r1.set_title("The Occurrence of Traffic Violation and Violation Type Based on Race", fontsize=40)
r1.set_xlabel("Race", fontsize=30)
r1.set_ylabel("Count", fontsize=30)
r1.tick_params(axis='both', labelsize=25)

# Legend sits below the axes, laid out in three columns.
r1.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.05),
    fancybox=True,
    shadow=True,
    ncol=3,
    labelspacing=2,
    fontsize=20,
)
plt.show()
# Count stops for every (Gender, Violation Type) pair: tag each row with 1 and sum per group.
gv = sam.assign(count=1)
aggregation_functions = {'count': 'sum'}
nd = gv.groupby(['Gender', 'Violation Type']).aggregate(aggregation_functions)

# Large canvas (40 x 30 inches) to go with the oversized fonts below.
fig, axs = plt.subplots()
fig.set_size_inches(40, 30)

# Grouped bars: one cluster per gender, one bar per violation type.
g1 = sns.barplot(
    data=nd.reset_index(),
    x="Gender",
    y="count",
    hue="Violation Type",
    palette=["#ff8378", "#5bc7a7"],
    ax=axs,
)
g1.set_title("The Occurrence of Traffic Violation and Violation Type Based on Gender", fontsize=40)
g1.set_xlabel("Gender", fontsize=30)
g1.set_ylabel("Count", fontsize=30)
g1.tick_params(axis='both', labelsize=25)

# Legend sits below the axes, laid out in three columns.
g1.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.05),
    fancybox=True,
    shadow=True,
    ncol=3,
    labelspacing=2,
    fontsize=20,
)
plt.show()
# Build the regression table: a numeric outcome plus one-hot Gender/Race predictors.
data_reg = sam.copy()

# Numeric code for each violation outcome.
vt = {
    "Warning": 1,
    "Citation": 2,
    "ESERO": 3,
    "SERO": 4,
}
data_reg["violation_type_num"] = [vt[v] for v in data_reg["Violation Type"]]

# One indicator column per Gender value and per Race value.
data_reg = pd.get_dummies(data_reg, columns=["Gender", "Race"])

# Copy (not rename) the 'Race_NATIVE AMERICAN' indicator to a name without a
# space -- presumably so it can be written as a bare term in the formula below.
data_reg["Race_NATIVE"] = data_reg["Race_NATIVE AMERICAN"]
data_reg.head()

# Ordinary least squares of the outcome code on hour, race and gender.
# (The formula previously contained a stray duplicated '+'.)
# NOTE(review): the formula has an intercept plus every Race indicator and both
# Gender indicators, so the predictors are linearly dependent and the individual
# coefficients are not uniquely determined -- consider dropping one reference
# level per category before interpreting the summary.
distlr = sm.ols(
    formula='violation_type_num ~ hour + Race_ASIAN + Race_BLACK + Race_WHITE'
            ' + Race_HISPANIC + Race_OTHER + Race_NATIVE + Gender_F + Gender_M',
    data=data_reg,
).fit()
distlr.summary()
# Canvas for the raw-residual plot (20 x 10 inches).
fig, axs = plt.subplots(nrows=1)
fig.set_size_inches(20, 10)

# Feed the fitted model the same predictor columns it was trained on.
predictors = [
    "hour", "Gender_F", "Gender_M", "Race_ASIAN", "Race_BLACK",
    "Race_WHITE", "Race_HISPANIC", "Race_OTHER", "Race_NATIVE",
]
predict = distlr.predict({name: data_reg[name] for name in predictors})

# Residual = observed outcome code minus the model's (unrounded) prediction.
resid = data_reg["violation_type_num"] - predict

# Distribution of residuals for each hour of the day.
d1 = sns.violinplot(x=data_reg["hour"], y=resid, ax=axs)
d1.set_title("Violin Plot of Residuals vs. Hour for the Multiple Linear Regression Model", fontsize=20)
d1.set_xlabel("Hour", fontsize=15)
d1.set_ylabel("Residual", fontsize=15)
d1.tick_params(axis='both', labelsize=15)
plt.show()
Our model predicts a value between 1 and 2 (technically it could also reach 3 for ESERO or 4 for SERO, but those outcomes are very rare), which means it can output a decimal value such as 1.5. On its own this does not make sense, because in practice there are only two outcomes: 1 for a warning or 2 for a citation. To make the predictions interpretable, we rounded any prediction less than 1.5 down to 1 and any prediction greater than or equal to 1.5 up to 2 (the code applies the same rule at 2.5 and 3.5 for the rare ESERO and SERO codes).
# Snap each prediction to an outcome code: below 1.5 -> 1, below 2.5 -> 2,
# below 3.5 -> 3, anything else -> 4.
class_upper_bounds = ((1, 1.5), (2, 2.5), (3, 3.5))
rounded = [
    next((code for code, upper in class_upper_bounds if p < upper), 4)
    for p in predict
]

# Canvas for the rounded-residual plot (20 x 10 inches).
fig, axs = plt.subplots(nrows=1)
fig.set_size_inches(20, 10)

# Residual = observed outcome code minus the rounded prediction.
resid = data_reg["violation_type_num"] - rounded

# Distribution of rounded residuals for each hour of the day.
d2 = sns.violinplot(x=data_reg["hour"], y=resid, ax=axs)
d2.set_title("Violin Plot of Residuals vs. Hours for the Multiple Linear Regression Model", fontsize=20)
d2.set_xlabel("Hour", fontsize=15)
d2.set_ylabel("Residual", fontsize=15)
d2.tick_params(axis='both', labelsize=15)
plt.show()
In our violin plot of the residuals vs. hour for the multiple linear regression model, a residual of 0 means that the model predicted the correct violation type, and any nonzero residual (typically 1 or -1) means that it predicted the wrong one.
# Canvas for the residuals-by-race plot (20 x 10 inches).
fig, axs = plt.subplots(nrows=1)
fig.set_size_inches(20, 10)

# Distribution of the current `resid` values for each race.
d2 = sns.violinplot(x=sam["Race"], y=resid, ax=axs)
d2.set_title("Violin Plot of Residuals vs. Race for the Multiple Linear Regression Model", fontsize=20)
d2.set_xlabel("Race", fontsize=15)
d2.set_ylabel("Residual", fontsize=15)
d2.tick_params(axis='both', labelsize=15)
plt.show()
# Canvas for the residuals-by-gender plot (20 x 10 inches).
fig, axs = plt.subplots(nrows=1)
fig.set_size_inches(20, 10)

# Distribution of the current `resid` values for each gender.
d2 = sns.violinplot(x=sam["Gender"], y=resid, palette="coolwarm", ax=axs)
d2.set_title("Violin Plot of Residuals vs. Gender for the Multiple Linear Regression Model", fontsize=20)
d2.set_xlabel("Gender", fontsize=15)
d2.set_ylabel("Residual", fontsize=15)
d2.tick_params(axis='both', labelsize=15)
plt.show()

Drive Safe Out There!